import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.io as pio
import seaborn as sns
pio.renderers.default = 'notebook'
pyo.init_notebook_mode()
sns.set_style('white')
# Load the customer dimension table. The 'Other' industry is renamed so it
# cannot collide with the 'Other' customer type when both feed one label set.
customers_df = pd.read_excel('./customers.xlsx')
other_industry_mask = customers_df['customer_industry'] == 'Other'
customers_df.loc[other_industry_mask, 'customer_industry'] = 'Other Industry'

# Load the monthly metrics table and parse dates into proper timestamps.
customers_metrics_df = pd.read_excel('./customers_metrics.xlsx')
customers_metrics_df['date'] = pd.to_datetime(customers_metrics_df['date'])

print("Customers dataset overview\n")
customers_df.sample(10)
Customers dataset overview
| | customerID | customer_industry | customer_type | customer_region |
|---|---|---|---|---|
| 76 | customer_77 | Rugs | Retailer | Americas |
| 63 | customer_64 | Flooring | Retailer | Europe |
| 27 | customer_28 | Flooring | Manufacturer | Europe |
| 57 | customer_58 | Flooring | Retailer | Americas |
| 9 | customer_10 | Rugs | Manufacturer | Europe |
| 75 | customer_76 | Flooring | Manufacturer | Americas |
| 28 | customer_29 | Tile | Manufacturer | Americas |
| 46 | customer_47 | Flooring | Manufacturer | Europe |
| 34 | customer_35 | Flooring | Manufacturer | Americas |
| 33 | customer_34 | Tile | Manufacturer | Europe |
# Build the Sankey node labels: customer types, then industries, then regions.
# Thanks to the earlier 'Other Industry' rename, every label is unique.
label_values = (
    customers_df['customer_type'].unique().tolist()
    + customers_df['customer_industry'].unique().tolist()
    + customers_df['customer_region'].unique().tolist()
)
labels = {name: idx for idx, name in enumerate(label_values)}

# Customer counts per (type, industry) pair and per (industry, region) pair —
# these become the link widths of the diagram's two levels.
first_lvl_group = customers_df[['customerID', 'customer_type', 'customer_industry']].groupby(['customer_type', 'customer_industry']).count()
second_lvl_group = customers_df[['customerID', 'customer_industry', 'customer_region']].groupby(['customer_industry', 'customer_region']).count()

# Flatten both grouped tables into the source/target/value lists plotly expects.
data = {'source': [], 'target': [], 'value': []}
for grouped in (first_lvl_group, second_lvl_group):
    for (src_name, dst_name), row in grouped.iterrows():
        data['source'].append(labels[src_name])
        data['target'].append(labels[dst_name])
        data['value'].append(row['customerID'])
# Quick cardinality overview of the dimension columns.
unique_customers = customers_df['customerID'].unique()
print(f'Unique customers count: {len(unique_customers)}')
print(f"Customer types: {customers_df['customer_type'].unique()}")
print(f"Customer industries: {customers_df['customer_industry'].unique()}")
print(f"Customer regions: {customers_df['customer_region'].unique()}")
Unique customers count: 83 Customer types: ['Retailer' 'Manufacturer' 'Other'] Customer industries: ['Tile' 'Flooring' 'Rugs' 'Other Industry' 'Walls'] Customer regions: ['Asia' 'Europe' 'Americas' 'Africa' 'Oceania']
# Missing-value audit for the customers table (all columns are fully populated).
print("Empty values count by column:\n")
print(customers_df.isnull().sum())
Empty values count by column: customerID 0 customer_industry 0 customer_type 0 customer_region 0 dtype: int64
# Two-level Sankey: customer type -> industry -> region.
# Node colors: 3 types (dark), 5 industries (orange), 5 regions (cyan).
node_colors = ["#1d242d"] * 3 + ['#ff6732'] * 5 + ['#00f2ff'] * 5
sankey_nodes = dict(
    pad=20,
    thickness=30,
    line=dict(color="black", width=0.5),
    label=list(labels.keys()),
    color=node_colors,
)
fig = go.Figure(data=[go.Sankey(node=sankey_nodes, link=data)])
fig.update_layout(title_text="Roomvo Customers Diagram ", font_size=12)
fig.show()
Initial Observations:¶
- Manufacturer is a dominant type of customer & Retailer takes the second place,
- Flooring is the most popular customer industry since most customers across every type are from there. Looking at the chart we could assume that customers from the Flooring industry are being attracted to Roomvo independent of their customer type,
- Observing relatively developed markets of Americas & Europe we could make a weak assumption that upon reaching a particular development threshold by the number of customers all markets develop in such a way that there are two dominant customer industries Flooring & Tile with Flooring prevailing over Tile.
Customers-Metrics¶
# Peek at 10 random rows of the Customers-Metrics table.
print("Customers-Metrics dataset overview\n")
customers_metrics_df.sample(10)
Customers-Metrics dataset overview
| | customerID | date | usage_rate | num_sessions | num_product_views | uploaded_room_view_rate | total_time_spent_in_seconds | mobile_uploaded_room_view_rate | desktop_uploaded_room_view_rate | product_conversion_rate |
|---|---|---|---|---|---|---|---|---|---|---|
| 473 | customer_16 | 2023-04-01 | 0.011368 | 449.806879 | 3351.981322 | 0.476777 | 148580.419257 | 0.650199 | 0.472524 | 0.048940 |
| 679 | customer_25 | 2023-01-01 | 0.140040 | 532.775424 | 6731.175419 | 0.869153 | 291544.058265 | 0.806706 | 0.436732 | 0.017231 |
| 96 | customer_53 | 2023-10-01 | 0.027159 | 283.045504 | 2147.885469 | 0.770484 | 120606.154264 | 1.000000 | 0.474553 | 0.036479 |
| 293 | customer_83 | 2023-05-01 | 0.051779 | 360.140378 | 2898.459430 | 0.938142 | 193161.682806 | 1.000000 | 0.641418 | 0.020625 |
| 935 | customer_50 | 2023-09-01 | 0.506347 | 211.599841 | 1653.816278 | 0.710519 | 81601.021985 | 1.000000 | 0.345999 | 0.020774 |
| 718 | customer_56 | 2023-03-01 | 0.007200 | 475.820727 | 4058.954738 | 0.702048 | 218375.663024 | 0.941442 | 0.570165 | 0.017733 |
| 132 | customer_13 | 2023-05-01 | 0.044434 | 374.516811 | 3032.379406 | 0.675073 | 132942.180488 | 0.544133 | 0.559635 | 0.021241 |
| 233 | customer_67 | 2023-07-01 | 0.029641 | 229.882342 | 2206.516893 | 0.368189 | 88923.143201 | 0.612836 | 0.209756 | 0.050328 |
| 225 | customer_05 | 2023-05-01 | 0.525342 | 352.916259 | 2634.795556 | 0.708026 | 101507.067682 | 0.908059 | 0.254048 | 0.007822 |
| 138 | customer_33 | 2023-07-01 | 0.047651 | 319.854845 | 4910.626328 | 0.433042 | 170598.529996 | 0.812536 | 0.460817 | 0.053578 |
# Missing-value audit for the metrics table — several rate columns have gaps.
print("Empty values count by column:\n")
print(customers_metrics_df.isnull().sum())
Empty values count by column: customerID 0 date 0 usage_rate 6 num_sessions 0 num_product_views 0 uploaded_room_view_rate 19 total_time_spent_in_seconds 0 mobile_uploaded_room_view_rate 32 desktop_uploaded_room_view_rate 24 product_conversion_rate 19 dtype: int64
We have some empty values in Customers-Metrics. Let's make a heatmap by columns where dark lines mean filled values, and bright lines mean missing values.
# Visualise missingness: dark cells = present, bright cells = missing.
missing_mask = customers_metrics_df.isnull()
sns.heatmap(missing_mask, cbar=False)
plt.show()

# A missing rate is interpreted as "the event did not occur", so zero-filling
# is safe for the downstream analysis.
customers_metrics_df = customers_metrics_df.fillna(0)
By looking at the heatmap, we can see that:
- If both `mobile_uploaded_room_view_rate` and `desktop_uploaded_room_view_rate` are empty, then `uploaded_room_view_rate` and `product_conversion_rate` are empty,
- If at least one of `mobile_uploaded_room_view_rate` and `desktop_uploaded_room_view_rate` is not empty, then `product_conversion_rate` is not empty.
These observations prove that filling empty rows with zeroes won't break any business logic. Setting missing values to zero in this case would imply that the event did not occur (e.g., no usage or no product conversion).
Now let's proceed with statistical observations.
# High-level statistics: customer coverage, observed date span, and
# per-column descriptive stats.
print("General statistics:")
unique_ids = customers_metrics_df['customerID'].unique()
print(f'Unique customers count: {len(unique_ids)}')
observed_dates = customers_metrics_df['date'].dt.date
print(f"min date: {observed_dates.min()}, max date: {observed_dates.max()}\n")
customers_metrics_df.describe().round(2)
General statistics: Unique customers count: 83 min date: 2023-01-01, max date: 2023-12-01
| | date | usage_rate | num_sessions | num_product_views | uploaded_room_view_rate | total_time_spent_in_seconds | mobile_uploaded_room_view_rate | desktop_uploaded_room_view_rate | product_conversion_rate |
|---|---|---|---|---|---|---|---|---|---|
| count | 982 | 982.00 | 982.00 | 982.00 | 982.00 | 982.00 | 982.00 | 982.00 | 982.00 |
| mean | 2023-06-17 21:26:01.710794240 | 0.07 | 537.00 | 5063.89 | 0.73 | 285892.20 | 0.82 | 0.62 | 0.03 |
| min | 2023-01-01 00:00:00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 2023-04-01 00:00:00 | 0.01 | 208.06 | 1755.91 | 0.62 | 91313.36 | 0.74 | 0.46 | 0.01 |
| 50% | 2023-07-01 00:00:00 | 0.03 | 302.11 | 2908.69 | 0.77 | 157721.05 | 0.86 | 0.65 | 0.03 |
| 75% | 2023-10-01 00:00:00 | 0.07 | 446.01 | 4701.78 | 0.91 | 255700.09 | 0.97 | 0.81 | 0.04 |
| max | 2023-12-01 00:00:00 | 0.96 | 24716.83 | 255142.00 | 1.00 | 12259473.43 | 1.00 | 1.00 | 0.37 |
| std | NaN | 0.13 | 1736.76 | 15172.88 | 0.23 | 957634.30 | 0.21 | 0.26 | 0.03 |
Initial Observations:¶
- The high variance in columns like `num_sessions`, `num_product_views`, and `total_time_spent_in_seconds` suggests the presence of high-value anomalies, which is also confirmed by the quantile values of the selected columns,
- The general level of conversion is relatively small (~3%).
Before moving to Product Performance Analysis, let's identify the anomalies with IQR and investigate them more properly.
def detect_anomalies(df, column, iqr_factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    A value is anomalous when it falls below ``Q1 - iqr_factor * IQR`` or above
    ``Q3 + iqr_factor * IQR`` (Tukey's rule; ``iqr_factor=1.5`` is the classic
    box-plot whisker the original analysis used).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan.
    column : str
        Name of the numeric column to test.
    iqr_factor : float, optional
        Multiplier applied to the IQR when building the fences. Default 1.5
        keeps the original behaviour; larger values flag fewer rows.

    Returns
    -------
    pandas.DataFrame
        The subset of rows flagged as anomalous (possibly empty).
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - iqr_factor * IQR
    upper_bound = Q3 + iqr_factor * IQR
    # Keep rows strictly outside either fence; boundary values are not anomalies.
    return df[(df[column] < lower_bound) | (df[column] > upper_bound)]
# Flag outliers per metric with the IQR rule and tabulate the counts.
anomalies_usage_rate = detect_anomalies(customers_metrics_df, 'usage_rate')
anomalies_num_sessions = detect_anomalies(customers_metrics_df, 'num_sessions')
anomalies_total_time = detect_anomalies(customers_metrics_df, 'total_time_spent_in_seconds')
anomalies_conversion_rate = detect_anomalies(customers_metrics_df, 'product_conversion_rate')

# One row per metric with its anomaly count.
anomaly_summary_df = pd.DataFrame({
    'Metric': ['Usage Rate', 'Number of Sessions', 'Total Time Spent', 'Conversion Rate'],
    'Number of Anomalies': [
        len(anomalies_usage_rate),
        len(anomalies_num_sessions),
        len(anomalies_total_time),
        len(anomalies_conversion_rate),
    ],
})
anomaly_summary_df
| | Metric | Number of Anomalies |
|---|---|---|
| 0 | Usage Rate | 94 |
| 1 | Number of Sessions | 44 |
| 2 | Total Time Spent | 44 |
| 3 | Conversion Rate | 28 |
# Attach customer dimensions to the session-count anomalies (left join keeps
# every anomalous row even if a customerID is absent from customers_df).
anomalies_sessions_merged = pd.merge(anomalies_num_sessions, customers_df, on='customerID', how='left')
# Anomaly frequency per dimension value, sorted descending by count.
anomaly_sessions_industry = anomalies_sessions_merged['customer_industry'].value_counts()
anomaly_sessions_type = anomalies_sessions_merged['customer_type'].value_counts()
anomaly_sessions_region = anomalies_sessions_merged['customer_region'].value_counts()
# Three stacked horizontal bar charts: industry, type, region.
fig, axs = plt.subplots(3, 1, figsize=(10, 15))
sns.barplot(x=anomaly_sessions_industry.values, y=anomaly_sessions_industry.index, ax=axs[0], palette=["#1d242d", '#ff6732', '#00f2ff', '#6e6e6e'])
axs[0].set_title('Anomalies in Session Counts by Customer Industry')
axs[0].set_xlabel('Number of Anomalies')
axs[0].set_ylabel('Industry')
sns.barplot(x=anomaly_sessions_type.values, y=anomaly_sessions_type.index, ax=axs[1], palette=["#1d242d", '#ff6732', '#00f2ff'])
axs[1].set_title('Anomalies in Session Counts by Customer Type')
axs[1].set_xlabel('Number of Anomalies')
axs[1].set_ylabel('Customer Type')
sns.barplot(x=anomaly_sessions_region.values, y=anomaly_sessions_region.index, ax=axs[2], palette=["#1d242d", '#ff6732', '#00f2ff'])
axs[2].set_title('Anomalies in Session Counts by Customer Region')
axs[2].set_xlabel('Number of Anomalies')
axs[2].set_ylabel('Region')
plt.tight_layout()
plt.show()
General Patterns for Anomalies:¶
- The Flooring and Rugs industries have the highest number of anomalies, suggesting these industries might have specific usage patterns or needs that lead to unusual session counts,
- Retailers dominate in terms of session count anomalies, indicating potential differences in how retailers versus Manufacturers use the Roomvo app,
- Oceania and the Americas show a higher tendency for anomalous session counts, which might be influenced by regional market characteristics or user behaviour.
2. Product Performance Analysis¶
A) Let's start by proposing 2 KPIs for product utilization:
- Repeat Usage Rate. This KPI assesses the rate at which users return to use the app within a given timeframe. This metric helps in understanding user loyalty and the app's ability to engage users over time:
$$ \text{Repeat Usage Rate} = \left( \frac{\text{Number of Unique Users with Multiple Sessions in a Month}}{\text{Total Number of Unique Users in the Month}} \right) \times 100\% $$
# KPI 1: Repeat Usage Rate — share of customers with more than one session
# in a calendar month.
month_period = customers_metrics_df['date'].dt.to_period('M')
repeat_usage_data = customers_metrics_df.groupby(['customerID', month_period]).agg({'num_sessions': 'sum'}).reset_index()
repeat_usage_data['multiple_sessions'] = repeat_usage_data['num_sessions'] > 1

# Per month: how many distinct customers, and how many of them repeated.
repeat_usage_summary = (
    repeat_usage_data
    .groupby('date')
    .agg(
        total_users=('customerID', 'nunique'),
        users_with_multiple_sessions=('multiple_sessions', 'sum'),
    )
    .reset_index()
)
repeat_usage_summary['repeat_usage_rate'] = (repeat_usage_summary['users_with_multiple_sessions'] / repeat_usage_summary['total_users']) * 100
repeat_usage_summary[['date', 'repeat_usage_rate']]
| | date | repeat_usage_rate |
|---|---|---|
| 0 | 2023-01 | 98.750000 |
| 1 | 2023-02 | 98.750000 |
| 2 | 2023-03 | 98.765432 |
| 3 | 2023-04 | 100.000000 |
| 4 | 2023-05 | 100.000000 |
| 5 | 2023-06 | 98.780488 |
| 6 | 2023-07 | 98.780488 |
| 7 | 2023-08 | 95.180723 |
| 8 | 2023-09 | 96.385542 |
| 9 | 2023-10 | 97.590361 |
| 10 | 2023-11 | 96.385542 |
| 11 | 2023-12 | 95.180723 |
Obtained high values suggest strong user engagement and frequent utilization of the Roomvo app.
- Daily Active Users (DAU) to Monthly Active Users (MAU) Ratio. This KPI measures user engagement by comparing the number of unique daily active users to the number of unique monthly active users. A higher DAU/MAU ratio indicates strong user engagement and regular utilization of the product.
$$ \text{DAU/MAU Ratio} = \frac{\text{Daily Active Users (DAU)}}{\text{Monthly Active Users (MAU)}} $$
Unfortunately, the dataset does not include daily user activity data, therefore we cannot calculate this metric.
B) Now we need to derive Average time spent per product view and Average number of product views per session. Then we will analyze the relationship between the two metrics.
# Derived per-row KPIs.
# NOTE(review): rows where num_product_views or num_sessions is 0 produce
# NaN/inf here (mean() then skips them) — confirm that is the intended handling.
customers_metrics_df['avg_time_per_view'] = (
    customers_metrics_df['total_time_spent_in_seconds'] / customers_metrics_df['num_product_views']
)
customers_metrics_df['avg_views_per_session'] = (
    customers_metrics_df['num_product_views'] / customers_metrics_df['num_sessions']
)
# Monthly means of both derived KPIs for the time-series charts below.
metrics_time_aggregated_df = (
    customers_metrics_df.groupby('date').mean(numeric_only=True)[['avg_time_per_view', 'avg_views_per_session']]
)
# Side-by-side time series of the two derived KPIs.
plt.figure(figsize=(18, 6))

# Left panel: average seconds spent per product view.
plt.subplot(1, 2, 1)
plt.plot(metrics_time_aggregated_df['avg_time_per_view'], color="#1d242d", marker='o')
plt.title('Average Time per Product View Over Time')
plt.xlabel('Date')
plt.ylabel('Average Time (seconds)')
plt.grid()

# Right panel: average product views per session.
plt.subplot(1, 2, 2)
plt.plot(metrics_time_aggregated_df['avg_views_per_session'], color='#ff6732', marker='o')
plt.title('Average Product Views per Session Over Time')
plt.xlabel('Date')
plt.ylabel('Average Views per Session')
plt.tight_layout()
plt.grid()
plt.show()
# Pearson correlation between the two derived KPIs, plus descriptive stats.
kpi_cols = ['avg_time_per_view', 'avg_views_per_session']
relationship_analysis = customers_metrics_df[kpi_cols].corr()
pearson_r = relationship_analysis.iloc[0, 1].round(3)
print("\nPearson correlation coefficient:", pearson_r, '\n')
kpi_descriptive_stats = customers_metrics_df[kpi_cols].describe()
kpi_descriptive_stats
Pearson correlation coefficient: -0.353
| | avg_time_per_view | avg_views_per_session |
|---|---|---|
| count | 963.000000 | 963.000000 |
| mean | 57.402148 | 10.021939 |
| std | 26.175744 | 4.387647 |
| min | 7.227447 | 1.520970 |
| 25% | 43.091456 | 7.184736 |
| 50% | 52.925513 | 9.397827 |
| 75% | 65.842346 | 12.030127 |
| max | 461.923281 | 69.349206 |
A negative correlation of approximately -0.35 suggests that as the average number of product views per session increases, the average time spent per product view tends to decrease.
This could indicate that users browsing more products in a session spend less time on each product.
C) Analyzing the uploaded room view rate on mobile devices and desktops by visualizing their distributions.
# Overlay the mobile vs desktop uploaded-room-view-rate distributions.
plt.figure(figsize=(15, 6))
device_series = [
    ('mobile_uploaded_room_view_rate', 'Mobile', "#ff6732"),
    ('desktop_uploaded_room_view_rate', 'Desktop', "#1d242d"),
]
for column, device_label, colour in device_series:
    sns.kdeplot(customers_metrics_df[column], fill=True, label=device_label, color=colour)
plt.title('Uploaded Room View Rates: Mobile vs Desktop')
plt.xlabel('Upload Rate')
plt.ylabel('Density')
plt.legend()
plt.show()
Mobile users generally have a higher rate of uploading their own room images compared to desktop users. This could be due to the convenience of taking pictures with mobile devices.
3. Customer Insights¶
Let's visualize the dependency between usage_rate & industry/type/region to find insights.
# Join dimensions onto metrics and summarise usage_rate per dimension value.
merged_df = pd.merge(customers_df, customers_metrics_df, on='customerID', how='inner')
usage_stats = ['mean', 'std', 'min', 'max', 'count']
detailed_usage_by_industry = merged_df.groupby('customer_industry')['usage_rate'].agg(usage_stats)
detailed_usage_by_type = merged_df.groupby('customer_type')['usage_rate'].agg(usage_stats)
detailed_usage_by_region = merged_df.groupby('customer_region')['usage_rate'].agg(usage_stats)
detailed_usage_by_industry, detailed_usage_by_type, detailed_usage_by_region
( mean std min max count
customer_industry
Flooring 0.099160 0.152578 0.000000 0.874305 574
Other Industry 0.044886 0.157498 0.000595 0.692941 19
Rugs 0.045434 0.093894 0.000000 0.962434 120
Tile 0.027871 0.028252 0.000000 0.355055 245
Walls 0.026255 0.012202 0.001831 0.047743 24,
mean std min max count
customer_type
Manufacturer 0.073596 0.126012 0.000000 0.962434 610
Other 0.036108 0.025964 0.002741 0.080749 24
Retailer 0.071612 0.135429 0.000000 0.809821 348,
mean std min max count
customer_region
Africa 0.029369 0.021414 0.004189 0.073918 36
Americas 0.065726 0.113587 0.000000 0.809821 413
Asia 0.071014 0.142280 0.000000 0.818534 84
Europe 0.087015 0.149688 0.000000 0.962434 370
Oceania 0.054664 0.088712 0.001350 0.692941 79)
# Three stacked horizontal bar charts of mean usage_rate: industry, type, region.
fig, axs = plt.subplots(3, 1, figsize=(10, 15))
sns.barplot(x='mean', y=detailed_usage_by_industry.index, ax=axs[0], data=detailed_usage_by_industry, palette=["#1d242d", '#ff6732', '#00f2ff', '#fbdb88', '#6e6e6e'])
axs[0].set_title('Average Usage Rate by Customer Industry')
axs[0].set_xlabel('Average Usage Rate')
axs[0].set_ylabel('Industry')
sns.barplot(x='mean', y=detailed_usage_by_type.index, ax=axs[1], data=detailed_usage_by_type, palette=["#1d242d", '#ff6732', '#00f2ff'])
axs[1].set_title('Average Usage Rate by Customer Type')
axs[1].set_xlabel('Average Usage Rate')
axs[1].set_ylabel('Type')
sns.barplot(x='mean', y=detailed_usage_by_region.index, ax=axs[2], data=detailed_usage_by_region, palette=["#1d242d", '#ff6732', '#00f2ff', '#fbdb88', '#6e6e6e'])
axs[2].set_title('Average Usage Rate by Customer Region')
axs[2].set_xlabel('Average Usage Rate')
axs[2].set_ylabel('Region')
plt.tight_layout()
plt.show()
Observations:¶
- Flooring industry exhibits notably higher usage, suggesting greater relevance or better fit of the Roomvo app for this industry,
- Manufacturers and Retailers show similar average usage rates, significantly higher than 'Other' types.,
- Europe leads in usage rate among regions, with Africa showing the lowest average usage.
4. Recommendations and Insights¶
- Target High-Engagement Industries and Regions
- Focus marketing and sales efforts on industries with higher usage rates, particularly the Flooring industry,
- Prioritize regions like Europe and Asia, where usage rates are higher, for expansion or targeted campaigns.
- Improve Engagement in Low-Usage Demographics
- Investigate reasons for lower usage rates in industries like Tile and Walls and regions like Africa,
- Develop tailored strategies or features to increase adoption and engagement in these areas.
- Leverage Mobile Platform Strengths
- With mobile users showing a higher rate of uploading their own room images, enhance mobile app features to capitalize on this behaviour.
- Customize Approach Based on Customer Type
- Recognize the different usage patterns between Manufacturers, Retailers and Other customer types,
- Develop specific strategies or features that cater to the unique needs of each customer type.